In this case study, we work with missing data using the Boston house-price dataset. We explore how the results change when missing values in several variables are replaced by imputed values.
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from IPython.display import Markdown as md
import seaborn as sns
def take_two(data,
             null_cols=None,
             test_col='AGE',
             cond='>10',
             impute_func=np.mean,
             samples=(.1, .2, .3)):
    """Null out a sample of rows in selected columns, then impute them.

    For every rate in ``samples`` a fresh copy of ``data`` is made. Among the
    rows matching the query ``f'{test_col}{cond}'`` a fraction ``sample_rate``
    is drawn without replacement; in those rows the ``null_cols`` columns are
    set to missing and then filled with ``impute_func`` evaluated column-wise
    on the rows that were *not* selected.

    Args:
        data: Source ``pandas.DataFrame``; it is never modified.
        null_cols: Column name or list of column names to blank out and
            impute. ``None`` (the default) means no columns.
        test_col: Name of the column used in the row-selection query.
        cond: Comparison appended to ``test_col`` to build the query string,
            e.g. ``'>10'``.
        impute_func: Callable invoked as ``impute_func(frame, axis=0)`` that
            returns one fill value per column, in column order.
        samples: Iterable of sampling fractions passed to
            ``DataFrame.sample(frac=...)``.

    Returns:
        dict mapping each sample rate to its imputed copy of ``data``.

    Raises:
        TypeError: If ``impute_func`` is not callable, or ``test_col`` or
            ``cond`` is not a string.
    """
    # Explicit raises instead of asserts: asserts disappear under ``python -O``.
    if not callable(impute_func):
        raise TypeError('impute_func must be callable')
    if not isinstance(test_col, str):
        raise TypeError('test_col must be a str')
    if not isinstance(cond, str):
        raise TypeError('cond must be a str')

    if null_cols is None:
        null_cols = []
    elif isinstance(null_cols, str):
        # A bare column name would otherwise be iterated character by character.
        null_cols = [null_cols]
    else:
        null_cols = list(null_cols)

    data_frames = {}
    for sample_rate in samples:
        df = data.copy()
        replace_index = df.query(f'{test_col}{cond}').sample(
            frac=sample_rate, replace=False).index
        print(f'Percent of imputed values:{(len(replace_index)/len(df.index)):,.2%}')
        df.loc[replace_index, null_cols] = None
        # Fill values come only from the rows that were left untouched.
        imputed = impute_func(
            df.loc[df.index.difference(replace_index), null_cols],
            axis=0)
        print(f'{imputed}\n{impute_func}\n{sample_rate}', end='\n')
        # Normalise to a positional array: impute_func may return a
        # label-indexed Series (np.mean) or a plain ndarray (np.median).
        fill_values = np.asarray(imputed)
        for column, fill_value in zip(null_cols, fill_values):
            # Assign back rather than ``df[column].fillna(..., inplace=True)``:
            # the chained in-place form acts on a temporary under pandas
            # Copy-on-Write and would leave ``df`` un-imputed.
            df[column] = df[column].fillna(fill_value)
        data_frames[sample_rate] = df
    return data_frames
# Fetch the Boston house-price dataset through scikit-learn's loader.
boston = load_boston()
# Render the description text shipped with the dataset as Markdown.
md(boston.DESCR)
# Features go into one dataframe, the target into a second single-column one.
df = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df_targets = pd.DataFrame(data=boston.target, columns=['target'])
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df_targets['target'],
    test_size=0.33,
    random_state=7337,
)
# Draw the pairwise feature correlations as a colour-coded matrix.
fig, ax = plt.subplots(figsize=(12, 12))
im = ax.matshow(df.corr())
# Place one tick per feature on each axis so every label is shown.
ax.set_xticks(np.arange(df.shape[1]))
ax.set_xticklabels(df.columns)
ax.set_yticks(np.arange(df.shape[1]))
ax.set_yticklabels(df.columns)
# Colour scale for the plotted correlation values.
fig.colorbar(im)
plt.show()
Based on the correlation matrix above, we chose to interpret the following variables:
- `B`: $1000(B_k - 0.63)^2$, where $B_k$ is the proportion of Black residents by town
- `LSTAT`: percentage of the population that is of lower status
- `AGE`: proportion of owner-occupied units built prior to 1940
# Build a pandas_profiling report over the features dataframe `df`.
profile = ProfileReport(df, title="Boston Features Profile")
# Display the report inline; presumably requires a notebook front end — confirm
# before running this as a plain script.
profile.to_notebook_iframe()